Setup and preprocessing
# Project-root-relative file paths
library(here)
library(rethinking)
library(tidyverse); print(packageVersion("tidyverse"))
# Microbiome data containers (phyloseq objects)
library(phyloseq)
library(bayesplot)
library(tidybayes)
library(ggbeeswarm)
# magrittr loaded after tidyverse for set_rownames() and the . pipes below
library(magrittr)
# Time-series helpers; slide_lgl() used for the antibiotic window below
# comes from here in the tsibble version this was written against
library(tsibble)
library(rstan)
library(ggridges)
library(patchwork)
# library(kableExtra)
# Cache compiled Stan models to disk and run chains in parallel
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
# Figure output directory comes from the Rmd params; create it if absent
fig_fp <- params$fig_fp
if (!dir.exists(here(fig_fp))) {
dir.create(here(fig_fp))
}
# Project helpers: load_mira_data(), phyloseq_to_agglomerated(),
# effective_window(), pp_subject_1.5.0(), ...
source(here("shared_functions.R"))
# Shared ggplot theme applied to all figures in this document
mira_theme <- theme_bw() + theme(
strip.background = element_rect(fill = NA)
)
theme_set(mira_theme)

We first load all the data into one object. This contains metadata related to each specimen, and the sequence and taxonomy table for all amplicon sequence variants (ASVs).
# Load everything into one object: the ASV sequence table, taxonomy
# assignments, medication records, and per-specimen metadata.
mira.all <- load_mira_data(
seqtab_fp = here("../shared/data/seqtab.rds"),
taxa_fp = here("../shared/data/taxa.rds"),
meds_fp = here("../shared/data/MIRA_Medications_Table.csv"),
specimen_fp = here("../shared/data/MIRA_Specimen_Table.csv")
)
# Convenience handles for components used repeatedly below
seqs <- mira.all$seqs
meds <- mira.all$meds

Here we filter and preprocess the data. We first remove non-informative samples, merge duplicates, and restrict the data to the taxa of interest (Staphylococcus aureus). Next, we calculate our “antibiotic effectiveness window” for each subject and the antibiotic of interest (vancomycin IV).
mira <- mira.all$ps
# Drop the samples flagged by `mask` from a phyloseq object, logging how
# many were removed. The printed strings match the original output exactly.
discard_samples <- function(ps, mask, what) {
  print(sprintf("Removing %d %s...", sum(mask), what))
  prune_samples(!mask, ps)
}
# Non-MIRA subjects (incl. blanks and controls) have a missing subject_id
mira <- discard_samples(mira, is.na(sample_data(mira)$subject_id),
                        "non-MIRA samples")
# Culture samples break the subject/type/date unique constraint
mira <- discard_samples(mira, grepl("Culture", sample_data(mira)$specimen_type),
                        "culture samples")
# Samples with zero total reads carry no information
mira <- discard_samples(mira, sample_sums(mira) == 0, "empty samples")
# Identify "duplicated" specimens (same subject, specimen type, and study day)
sample_data(mira) <- sample_data(mira) %>%
group_by(subject_id, specimen_type, study_day) %>%
# "specimen_id3" has the same value for duplicated specimens so phyloseq can
# use it as a grouping level; the "_D" suffix marks merged duplicates
mutate(specimen_id3 = case_when(
n() > 1 ~ paste0(first(as.character(specimen_id2)), "_D"),
TRUE ~ as.character(specimen_id2)
)) %>%
# phyloseq requires rownames matching the sample names
ungroup() %>% as.data.frame() %>%
set_rownames(.$specimen_id2)
# Sum abundances and merge sample table for duplicates
mira <- phyloseq::merge_samples(mira, "specimen_id3", fun=sum)
# Re-add the relevant metadata since merge_samples mangles factors and dates;
# after merging, the rownames are the (deduplicated) specimen_id3 values
sample_data(mira) <- sample_data(mira) %>%
mutate(specimen_id2 = rownames(.)) %>%
select(specimen_id2, specimen_id3) %>%
left_join(mira.all$samples) %>%
ungroup() %>% as.data.frame() %>%
set_rownames(.$specimen_id2)
# Keep only specimens that have matching antibiotic (meds) records
.abx_specimens <- inner_join(sample_data(mira), meds) %>%
  .$specimen_id2 %>%
  as.character()
mira.abx <- prune_samples(
  sample_data(mira)$specimen_id2 %in% .abx_specimens, mira)
# Melt the phyloseq object into long ("agglomerated") form:
# one row per specimen x ASV with its read count
agg <- phyloseq_to_agglomerated(mira.abx, "specimen_id2", "otu_id", "read_count")
# Per-specimen counts for the target species, with specimen totals.
# Note: total_reads is computed over ALL taxa in the specimen (before any
# taxonomic filtering), so it is the correct binomial denominator.
d <- agg %>%
  group_by(specimen_id2) %>%
  mutate(total_reads = sum(read_count)) %>%
  ungroup() %>%
  # Drop ASVs lacking a species-level assignment, then collapse ASV-level
  # counts to one row per specimen x species
  filter(!is.na(Genus) & !is.na(Species)) %>%
  group_by(specimen_id2, total_reads, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>%
  summarise(read_count = sum(read_count)) %>%
  ungroup() %>%
  # Keep only the taxon of interest (set via the Rmd params)
  filter(Genus == params$genus, Species == params$species) %>%
  # Attach specimen metadata; drop taxonomy ranks above genus
  left_join(sample_data(mira.abx)) %>%
  select(-c(Kingdom:Family))
# Calculate antibiotic effectiveness windows
subjects <- sample_data(mira.abx) %>%
group_by(subject_id) %>%
# A subject's exit date is their last specimen collection date
mutate(exit_date = max(collection_date)) %>%
distinct(subject_id, enroll_date, exit_date) %>%
# Expand back out to one row per medication record
right_join(meds) %>%
group_by(subject_id) %>%
mutate(study_day = as.integer(collection_date - enroll_date)) %>%
# NOTE(review): assumes exactly one med record on the exit date per subject;
# multiple matches here would break the mutate — confirm upstream uniqueness
mutate(exit_day = study_day[collection_date == exit_date]) %>%
# Limit to at most two days before enrollment (study_day > -3) and nothing
# after the exit date
filter(study_day > -3, collection_date <= exit_date) %>%
# TRUE on days the antibiotic of interest (params$abx) was administered
mutate(abx_yn = grepl(params$abx, abx_b)) %>%
# the .size parameter here is how long it takes to reach peak
# 1 = that day
mutate(reached_peak = slide_lgl(abx_yn, all, .size=2)) %>%
# the lag parameter here defines how long it lasts after end of admin.
mutate(on_abx = effective_window(reached_peak, lag=1)) %>%
ungroup()
# Manually split MIRA_024 into two sub-subjects
# NOTE(review): presumably two separate admissions; days 34-72 (between the
# two windows) are dropped as ambiguous — confirm against the study record
subjects <- subjects %>%
mutate(subject_id2 = case_when(
subject_id != "MIRA_024" ~ subject_id,
study_day <= 33 ~ "MIRA_024a",
study_day >= 73 ~ "MIRA_024b",
TRUE ~ NA_character_
)) %>%
filter(!is.na(subject_id2)) %>%
# Renumber the second stay so both sub-subjects start near day 0
# (as.double keeps case_when branch types compatible)
mutate(study_day = case_when(
subject_id2 == "MIRA_024b" ~ study_day - 73,
TRUE ~ as.double(study_day)
)) %>%
mutate(exit_day = ifelse(subject_id2 == "MIRA_024b", exit_day-73, exit_day))
d2 <- left_join(subjects, d)

The following figure shows the timecourses for each subject, including the antibiotic administration periods and the days each specimen type was collected.
Model v1.5
We will use a binomial model of reads attributable to our bacteria of interest in a given specimen \(i\) from a subject \(j\) (\(y_{ij}\)), given the number of total reads in the specimen (\(T_i\)) and some probability \(p_{ij}\) of seeing that bacteria: \[ y_{ij} \sim \text{Binom}(T_i,\ p_{ij}) \] We model the probability \(p_{ij}\) as a linear combination of our predictors: the previous day’s proportional abundance of that bacteria in the subject, \(x^{\text{prev}}_{ij}\), with coefficient \(\beta_{\text{prev}}\), and an indicator \(A_{ij}\) of whether the subject is under the effects of antibiotics, with subject-specific coefficient \(\beta_{\text{abx}_j}\). In addition, we include a global intercept \(\alpha_0\), a subject-specific intercept \(\alpha_j\), and a specimen-specific intercept \(\alpha_i\). These sub-intercepts allow the model to account for varying starting values for each subject (\(\alpha_j\)), and varying sequencing effort in each specimen (\(\alpha_i\)). \[ \text{logit}(p_{ij}) = \alpha_0 + \alpha_j + \alpha_i + \beta_{\text{prev}}\,x^{\text{prev}}_{ij} + \beta_{\text{abx}_j}\,A_{ij} \] The \(\text{logit}(p_{ij})\) term in the model transforms our linear combination of predictors into a probability between 0-1 so we can use it in the binomial model above.
While we don’t know the values of these parameters yet (hence the model), we can describe a probability distribution for each of them that we expect will contain the true value based on our prior expectations.
First, we give our global parameters, \(\alpha_0\) and \(\beta_{\text{prev}}\), weakly informative normal priors. As implemented, the global intercept is centered slightly below zero on the logit scale (the target taxon typically makes up a minority of reads), while the coefficient on the previous day’s abundance is centered at 0 (i.e., it could be positive or negative). \[ \alpha_0 \sim \text{Normal}(-1,\ 1) \\ \beta_{\text{prev}} \sim \text{Normal}(0,\ 1) \]
The same goes for our subject-level parameters, \(\alpha_j\) and \(\beta_{\text{abx}_j}\). Each subject will get its own normal distribution centered at 0, but the variance parameter \(\sigma\) of those distributions will be shared between subjects. This leads to partial pooling of the subject data, so that subjects with less data will be pulled towards the population median. We will model \(\sigma\) with a half-Cauchy distribution, which looks like a normal distribution but allows for more extreme values and is restricted to values greater than 0. \[ \alpha_j \sim \text{Normal}(0,\ \sigma_j) \\ \sigma_j \sim \text{Half-Cauchy}(0,\ 1) \\ \beta_{\text{abx}_j} \sim \text{Normal}(0,\ \sigma_{\text{abx}}) \\ \sigma_{\text{abx}} \sim \text{Half-Cauchy}(0,\ 1) \]
Finally, the specimen-level intercepts are also modeled the same way, with partial pooling. This means that the specimens with only a few reads (and hence higher uncertainty) are adjusted towards the median specimen intercept. \[ \alpha_i \sim \text{Normal}(0,\ \sigma_i) \\ \sigma_i \sim \text{Half-Cauchy}(0,\ 1) \]
Model v1.5.0
Straightforward implementation of the above model.
To prepare the data, we restrict it to just the sputum specimens, and add variables for the previous day’s proportional abundance.
# Model input: sputum specimens only, one row per specimen, with lagged
# abundance predictors computed within each (sub-)subject
d1.5.0.sp <- d2 %>% filter(specimen_type == "Sputum") %>%
select(subject_id=subject_id2, specimen_id2, study_day, total_reads, read_count, abx_yn, on_abx) %>%
# Remove empty or single-sample subjects
group_by(subject_id) %>%
# filter(n() > 1) %>%
arrange(subject_id, study_day) %>%
# Add lagged read and empirical proportion terms
# NOTE(review): lag() takes the previous *row* within a subject; when study
# days are skipped this is the previous specimen, not literally the previous
# day — confirm this is the intended predictor
mutate(lag_count = lag(read_count)) %>%
mutate(lag_emp_prop = lag(read_count)/lag(total_reads)) %>%
# Remove missing cases (first specimen per subject has no lag)
filter(!is.na(lag_count) & !is.na(read_count)) %>%
ungroup() %>%
droplevels()
d1.5.0.spl <- compose_data(d1.5.0.sp)

This plot shows the data being input into the model, as proportions. Blue indicates periods when antibiotics are considered active, as determined earlier.
And here is a slice of the data showing just MIRA_012:
# Example slice of the model input: all rows for subject MIRA_012
.dm12 <- d1.5.0.sp %>% filter(subject_id=="MIRA_012") %>%
select(subject_id, study_day, read_count, total_reads, on_abx, lag_emp_prop)
kable(.dm12)

| subject_id | study_day | read_count | total_reads | on_abx | lag_emp_prop |
|---|---|---|---|---|---|
| MIRA_012 | 1 | 12322 | 156193 | FALSE | 0.0001739 |
| MIRA_012 | 2 | 28715 | 101267 | FALSE | 0.0788896 |
| MIRA_012 | 3 | 21966 | 168130 | FALSE | 0.2835573 |
| MIRA_012 | 4 | 9420 | 118728 | FALSE | 0.1306489 |
| MIRA_012 | 7 | 7241 | 175965 | FALSE | 0.0793410 |
| MIRA_012 | 8 | 4461 | 125232 | FALSE | 0.0411502 |
| MIRA_012 | 9 | 5374 | 132877 | FALSE | 0.0356219 |
| MIRA_012 | 10 | 140 | 18171 | FALSE | 0.0404434 |
| MIRA_012 | 11 | 24541 | 298645 | FALSE | 0.0077046 |
| MIRA_012 | 14 | 19934 | 102019 | FALSE | 0.0821745 |
| MIRA_012 | 17 | 11361 | 131557 | FALSE | 0.1953950 |
| MIRA_012 | 18 | 101 | 2499 | FALSE | 0.0863580 |
| MIRA_012 | 21 | 41298 | 279284 | TRUE | 0.0404162 |
| MIRA_012 | 22 | 408 | 6405 | TRUE | 0.1478710 |
| MIRA_012 | 23 | 1771 | 11692 | FALSE | 0.0637002 |
| MIRA_012 | 24 | 3222 | 64496 | FALSE | 0.1514711 |
| MIRA_012 | 25 | 673 | 13990 | FALSE | 0.0499566 |
| MIRA_012 | 28 | 2986 | 33616 | FALSE | 0.0481058 |
| MIRA_012 | 30 | 17691 | 100509 | TRUE | 0.0888267 |
| MIRA_012 | 31 | 0 | 148 | TRUE | 0.1760141 |
| MIRA_012 | 32 | 9636 | 63630 | TRUE | 0.0000000 |
| MIRA_012 | 35 | 12769 | 67277 | FALSE | 0.1514380 |
| MIRA_012 | 36 | 14337 | 130406 | FALSE | 0.1897974 |
| MIRA_012 | 37 | 0 | 16145 | FALSE | 0.1099413 |
# Timecourse of proportional abundance for MIRA_012; points/lines turn blue
# while antibiotics are considered active (on_abx)
ggplot(.dm12, aes(x=study_day, y=read_count/total_reads, color=on_abx, group=subject_id)) +
geom_point(shape=21) +
geom_line() +
# NOTE(review): no alpha aesthetic is mapped above, so this scale is inert —
# confirm whether an aes(alpha=...) mapping was dropped at some point
scale_alpha_manual(values=c("TRUE"=0.6, "FALSE"=0), na.value=0) +
scale_y_continuous(labels=scales::percent, expand=c(0.2,0)) +
scale_color_manual(values=c("TRUE"="dodgerblue", "FALSE"="grey40")) +
mira_theme +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none") +
labs(x="Study day", y="Proportional abundance")

The Stan code for the model:
writeLines(readLines("arb_1.5.0.stan"))

data{
int<lower=1> n;                 // number of observations (specimens)
int<lower=1> n_subject_id;      // number of subjects
int read_count[n];              // reads of the target taxon per specimen
vector[n] lag_emp_prop; // previous day's empirical proportions
int total_reads[n];             // total reads per specimen (binomial denominator)
int specimen_id2[n];            // specimen index, 1..n
int subject_id[n];              // subject index, 1..n_subject_id
vector[n] on_abx; // whether the subject is under the effects of abx on this timepoint
}
parameters{
// Non-centered parameterization: the *_std vectors are unit-normal
// deviates, scaled by the corresponding sigma_* below
vector[n] a_spec_std;           // specimen-level intercept deviates
vector[n_subject_id] a_subj_std; // subject-level intercept deviates
vector[n_subject_id] b_abx_std;  // subject-level antibiotic-effect deviates
real mu;                        // global intercept (alpha_0)
real<lower=0> sigma_spec;
real<lower=0> sigma_subj;
real<lower=0> sigma_abx;
real b_lag;                     // coefficient on previous day's proportion
}
model{
vector[n] prob;                 // logit-scale linear predictor (not yet a probability)
// Global intercept term
mu ~ normal(-1, 1);
// Sigma and unit normal intercepts for specimen
// (<lower=0> makes the cauchy prior a half-Cauchy)
sigma_spec ~ cauchy(0, 1);
a_spec_std ~ normal(0, 1);
// Sigma and unit normal intercepts for subject
sigma_subj ~ cauchy(0, 1);
a_subj_std ~ normal(0, 1);
// Normal prior for lag coefficient
b_lag ~ normal(0, 1);
// Priors for abx coefficients (partial pooling via shared sigma_abx)
sigma_abx ~ cauchy(0, 1);
b_abx_std ~ normal(0, 1);
// Full model
for (i in 1:n) {
prob[i] = mu + sigma_spec * a_spec_std[specimen_id2[i]] + sigma_subj * a_subj_std[subject_id[i]] +
b_lag * lag_emp_prop[i] + sigma_abx * b_abx_std[subject_id[i]] * on_abx[i];
}
// binomial_logit applies inv_logit to prob internally
read_count ~ binomial_logit(total_reads, prob);
}
generated quantities {
// Recover the natural-scale parameters from the non-centered deviates
vector[n] a_spec = sigma_spec * a_spec_std;
vector[n_subject_id] a_subj = sigma_subj * a_subj_std;
vector[n_subject_id] b_abx = sigma_abx * b_abx_std;
}
Fitting the model:
# Load a cached model fit if requested AND the cache file exists; otherwise
# run the sampler. The file.exists() guard means a missing cache triggers a
# (slow) refit instead of a hard readRDS() error on a fresh checkout.
if (params$load_saved && file.exists("fit_arb_1.5.0_sputum.rds")) {
  m1.5.0.sp <- readRDS("fit_arb_1.5.0_sputum.rds")
} else {
  # 4 chains x 2000 iterations; raised treedepth/adapt_delta for the
  # hierarchical (non-centered) geometry
  m1.5.0.sp <- stan(
    file="arb_1.5.0.stan", data=d1.5.0.spl, iter=2000, chains=4,
    control=list(max_treedepth=15, adapt_delta=0.9))
  saveRDS(m1.5.0.sp, file="fit_arb_1.5.0_sputum.rds")
}

Let’s look at the posterior estimates for the global parameters, including the \(\sigma\) terms:
# Posterior densities for the global parameters and the pooling sigmas.
# NOTE(review): gather_samples() and ..ecdf.. are superseded in newer
# tidybayes/ggplot2 (gather_draws(), after_stat(ecdf)); this matches the
# versions the document was written against
m1.5.0.sp %>%
recover_types(d1.5.0.sp) %>%
gather_samples(mu, b_lag, sigma_spec, sigma_subj, sigma_abx) %>%
# Fill encodes tail probability: 0.5 at the median, ~0 in the tails
ggplot(aes(x = estimate, y=term, fill=0.5-abs(0.5-..ecdf..))) +
geom_vline(aes(xintercept=0), linetype=3) +
stat_density_ridges(geom="density_ridges_gradient", calc_ecdf=TRUE, scale=1.5) +
scale_fill_viridis_c("Probability", direction=1, alpha=0.7, option="C") +
scale_x_continuous(expand=c(0,0)) +
labs(x="Parameter estimate", y="Parameter", title="Global parameter estimates")

There is a huge amount of variance in the sigma_subj term, indicating a wide spread of intercepts for the various subjects, and a good indication that the subject-level intercepts were warranted. There is much less spread in the sigma_abx term, suggesting that the effects of antibiotics were similar between subjects.
Next, let’s look at the estimates for the \(\beta_{\text{abx}}\) terms, which is roughly the effect of antibiotics on the abundance of the bacteria.
# Per-subject antibiotic effect estimates (b_abx), one ridge per subject
m1.5.0.sp %>%
recover_types(d1.5.0.sp) %>%
spread_samples(b_abx[subject_id]) %>%
# Fill encodes tail probability: 0.5 at the median, ~0 in the tails
ggplot(aes(x = b_abx, y=fct_rev(subject_id), fill=0.5-abs(0.5-..ecdf..))) +
stat_density_ridges(geom="density_ridges_gradient", calc_ecdf=TRUE, scale=1.5) +
geom_vline(aes(xintercept=0), linetype=2, alpha=0.4) +
scale_fill_viridis_c("Probability", direction=1, option="C") +
# Note: limits=c(-2,2) clips posterior draws outside that range
scale_x_continuous(expand=c(0,0), limits=c(-2,2)) +
scale_y_discrete(expand=c(0,1.5)) +
labs(x="Parameter estimate", y="Subject", title="Antibiotic term estimates")

Most of these estimates are centered around zero with long tails, which suggests that there is not a strong effect of antibiotics. The one real outlier is MIRA_013.
The best way to understand these terms is to effectively simulate “new” timecourses from these estimates.
# Posterior-predictive timecourse plots simulated from the fitted model.
# with(data.frame(), { ... }) provides a throwaway evaluation scope so the
# temporaries below do not pollute the global environment; invisible()
# suppresses the return value.
invisible(with(data.frame(), {
# browser()
.m <- m1.5.0.sp
.d <- d1.5.0.sp
.m <- recover_types(.m, .d)
# Long-format posterior draws with specimen- and subject-indexed terms
post <- spread_samples(.m, mu, sigma_spec, sigma_subj, sigma_abx, b_lag,
a_spec[specimen_id2], a_subj[subject_id], b_abx[subject_id])
post$specimen_idx <- as.integer(post$specimen_id2)
# Attach observed data; drop draws for specimens with no observation
post2 <- left_join(post, .d) %>% filter(!is.na(read_count))
# With specimen-specific intercepts
pp_subject_spec_plots <- lapply(X = as.character(unique(post$subject_id)),
function(s) {
print(s)
pp_subject_1.5.0(post2, s, .d, TRUE)
})
# Reduce with `+` combines the per-subject ggplots (patchwork composition)
pp_subject_spec_plots <- Reduce(`+`, pp_subject_spec_plots)
plot(pp_subject_spec_plots)
# Without specimen-specific intercepts
pp_subject_plots <- lapply(X = as.character(unique(post$subject_id)), function(s) {
print(s)
pp_subject_1.5.0(post2, s, .d, FALSE)
})
pp_subject_plots <- Reduce(`+`, pp_subject_plots)
plot(pp_subject_plots)
# Subject MIRA_012 specifically:
mira12 <- pp_subject_1.5.0(post2, "MIRA_012", .d, FALSE)
plot(mira12)
}))